2 Exploratory Analysis

2.1 Term Frequencies

This initial exploration of term frequencies allowed us to identify some extraordinarily common words in this corpus, which were then added to the stop list. Not all common words were added to the stop list, however: some common words may still be relevant for distinguishing certain groups of documents. An example is the abbreviation ‘vs’, which indicates a comparison and appears most notably in earnings reports.

Documents per word

# Documents per word: how many documents each term appears in.
# `tdm` is created outside this chunk (presumably a tm term-document matrix
# with terms as rows -- confirm). Binary weighting turns every non-zero count
# into 1, so each row sum is the number of documents containing that term.
bin <- weightBin(tdm)
df <- data.frame(docFreqs = row_sums(bin))

# Left panel: the full distribution. Bars are drawn on the density scale so
# the kernel density estimate can be overlaid on the same axis, hence the
# "Density" y label. bins = 30 is the ggplot2 default, written out explicitly
# so stat_bin() does not emit its "pick better value" message.
g1 <- ggplot(df, aes(x = docFreqs)) +
  geom_histogram(aes(y = ..density..), bins = 30, alpha = 0.5) +
  geom_density(alpha = 0.2) +
  labs(
    x = "Number of Documents in which \n a Word Appears",
    y = "Density"
  )

# Right panel: the same data restricted to terms found in fewer than 100
# documents, so the shape of the bulk of the distribution becomes visible.
df <- data.frame(docFreqs = df$docFreqs[df$docFreqs < 100])
g2 <- ggplot(df, aes(x = docFreqs)) +
  geom_histogram(aes(y = ..density..), bins = 30, alpha = 0.5) +
  geom_density(alpha = 0.2) +
  labs(
    x = "Number of Documents in which \n a Word Appears",
    y = "Density",
    title = "Same Distribution Cut at x=100"
  )

grid.arrange(g1, g2, ncol = 2)

TF-IDF per word

# TF-IDF per word: total TF-IDF weight each term accumulates over the corpus.
# `tdm` is created outside this chunk; each row sum of the re-weighted matrix
# is the summed TF-IDF weight of one term (assuming terms are rows -- confirm).
tfi <- weightTfIdf(tdm)
df <- data.frame(termFreqs = row_sums(tfi))

# Left panel: the full distribution. Bars are on the density scale so the
# kernel density estimate can share the axis, hence the "Density" y label.
# bins = 30 is the ggplot2 default, stated explicitly to silence the
# stat_bin() message.
g1 <- ggplot(df, aes(x = termFreqs)) +
  geom_histogram(aes(y = ..density..), bins = 30, alpha = 0.5) +
  geom_density(alpha = 0.2) +
  labs(
    x = "Sum of TF-IDF Weights \n for each word",
    y = "Density"
  )

# Right panel: the same data restricted to terms whose summed weight is
# below 100, to show the bulk of the distribution without the long tail.
df <- data.frame(termFreqs = df$termFreqs[df$termFreqs < 100])
g2 <- ggplot(df, aes(x = termFreqs)) +
  geom_histogram(aes(y = ..density..), bins = 30, alpha = 0.5) +
  geom_density(alpha = 0.2) +
  labs(
    x = "Sum of TF-IDF Weights \n for each word",
    y = "Density",
    title = "Same Distribution Cut at x=100"
  )

grid.arrange(g1, g2, ncol = 2)

## via SVD

# The truncated SVD is computed once and cached on disk. The recipe that
# produced the cached file is kept below for reference:
#
# tfidf_tdm = weightTfIdf(tdm, normalize = TRUE)
# m =  Matrix::sparseMatrix(i=tfidf_tdm$i,
#                            j=tfidf_tdm$j,
#                            x=tfidf_tdm$v,
#                            dims=c(tfidf_tdm$nrow, tfidf_tdm$ncol),
#                            dimnames = tfidf_tdm$dimnames)
# svd = irlba(m, 150)
# save(svd,file='svd.RData')

# Restores the object `svd` (fields `d` and `v` are used below) into the
# current environment.
load("docs/final_data_plots/svd.RData")

# Scree plot of the singular values. The index is derived from the length of
# svd$d rather than a fixed 1:150, so the plot still works if the cached
# decomposition was computed with a different rank.
df <- data.frame(x = seq_along(svd$d), d = svd$d)
g1 <- ggplot(data = df, aes(x = x, y = d, group = 1)) +
  geom_line(color = "red") +
  labs(
    y = "Singular Values",
    x = "index",
    title = "Screeplot of Reuters tf-idf Matrix, vlines at 10, 25"
  ) +
  geom_point() +
  # Dotted guides at the two candidate cut-offs named in the title.
  geom_vline(xintercept = 25, linetype = "dotted", color = "blue", size = 1) +
  geom_vline(xintercept = 10, linetype = "dotted", color = "blue", size = 1)

# Static 2-D projection onto the first two columns of svd$v.
# NOTE(review): the data frame is called `u.df` but holds columns of `v`
# (the right singular vectors) -- presumably one row per document; confirm.
# `g2` is built here but not printed in this chunk.
u.df <- data.frame(x = svd$v[, 1], y = svd$v[, 2])
g2 <- ggplot(data = u.df, aes(x = x, y = y)) +
  geom_point() +
  labs(
    y = "Second Singular Component",
    x = "First Singular Component",
    title = "SVD Projection of Reuters tf-idf Term-Document Matrix"
  )

g1

# Interactive scatter of the first two columns of svd$v, with a hover label
# built from each point's heading and raw text.
# NOTE(review): no `data` argument is passed to plot_ly(), so `head` and
# `raw_text` in the hover formula must resolve from the surrounding
# environment -- confirm both exist there with one entry per point.
fig <- plot_ly(type = "scatter", mode = "markers") %>%
  add_trace(
    x = svd$v[, 1],
    y = svd$v[, 2],
    text = ~ paste("heading:", head, "$<br>text: ", raw_text),
    hoverinfo = "text",
    marker = list(color = "green", opacity = 0.6),
    showlegend = FALSE
  )

fig

Our initial creation of this SVD projection revealed that the data mixes two kinds of documents, briefs and full articles, which we needed to address.